Load needed libraries:
#install.packages("readr")
library("readr")
#install.packages('data.table')
library(data.table)
library(stringr)
#install.packages('ggplot2')
library(ggplot2)
library(gridExtra)
library(grid)
library(plyr)
library(purrr)
library(dplyr)
#install.packages('flextable')
library(flextable)
library(tibble)
library(ComplexHeatmap)
library(RColorBrewer)
library(ggplotify)
First lets load the tables (.csv) generated in Qiita
Prepare data frames for healthy samples from American Gut Study subsample:
# Load all AGP data
AGP <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/AGP_all.csv", header = TRUE, sep = ",")
# Load healthy samples' table
all_healthy <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/AGP_healthy.csv", header = TRUE, sep = ",")
#all_healthy <- all_healthy[1:10]
all_healthy <- select(all_healthy, sample_id, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, pielou_evenness, gini_index, strong, simpson, faith_pd, sex, race, host_age)
all_healthy$condition <- 'healthy'
all_healthy$qiita_study_id <- AGP$qiita_study_id[match(all_healthy$sample_id, AGP$sample_id)]
names(all_healthy)[names(all_healthy) == 'host_age'] <- 'age'
Load data for Inflammatory Bowel Disease data sets
all_IBD <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/IBD_all.csv", header = TRUE, sep = ",")
Load data for Fecal transplant - CDI with underlying IBD data set
trans_IBD_CDI <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_trans_IBD_CDI.csv", header = TRUE, sep = ",")
Load data for Changes following fecal microbial transplantation for recurrent CDI data set
C_diff_trans <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_C_diff_trans.csv", header = TRUE, sep = ",")
Load data for Longitudinal Chron’s disease study data set
CD_2 <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_CD_2.csv", header = TRUE, sep = ",")
Let’s define vector of names of the alpha diversity metrics that are going to be analysed:
metric <- c("shannon_entropy", "chao1", "menhinick", "margalef", "fisher_alpha", "simpson", "gini_index", "strong", "pielou_evenness", "faith_pd" )
#merge two datasets
healthy_disease <- rbind.fill(all_healthy, all_IBD)
healthy_disease$condition <- as.factor(healthy_disease$condition)
healthy_disease$condition <- relevel(healthy_disease$condition, "healthy")
# Sizes of each dataset
table(healthy_disease$condition)
##
## healthy CD UC
## 847 24 40
nrow(healthy_disease)
## [1] 911
head(healthy_disease)
## sample_id shannon_entropy chao1 menhinick margalef fisher_alpha
## 1 10317.000002033 5.652189 203.40000 3.795524 19.963845 40.36435
## 2 10317.000003915 3.992920 222.03030 4.105362 21.604709 44.95118
## 3 10317.000013515 2.888589 87.31250 1.729933 9.024752 14.38912
## 4 10317.000014528 4.912853 140.83333 2.969287 15.588208 29.00431
## 5 10317.000015825 5.420265 142.25000 3.072567 16.135163 30.35482
## 6 10317.000015938 1.990738 70.46154 1.549193 8.067581 12.51383
## simpson pielou_evenness gini_index strong faith_pd sex race
## 1 0.9552311 0.7632090 0.9993042 0.5656463 19.857051 female Caucasian
## 2 0.7977413 0.5246751 0.9995009 0.6729937 18.517975 male Caucasian
## 3 0.7276160 0.4706630 0.9998499 0.7485970 9.413326 male Hispanic
## 4 0.9217813 0.6870763 0.9995303 0.5995362 15.162940 female Caucasian
## 5 0.9563307 0.7578467 0.9994371 0.5678992 13.546122 female Caucasian
## 6 0.4286436 0.3134449 0.9998760 0.7680000 9.802688 female Other
## age condition qiita_study_id age_at_diagnosis
## 1 37.7 healthy 10317 NA
## 2 43.6 healthy 10317 NA
## 3 29.1 healthy 10317 NA
## 4 48.7 healthy 10317 NA
## 5 53.2 healthy 10317 NA
## 6 54.6 healthy 10317 NA
Generate a vector of 10 random colors for histograms:
qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
colors <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
histo_IBD <- vector('list', length(metric))
for (i in 1:length(metric)){
histo_IBD[[i]] <- all_IBD %>% ggplot(aes(x = .data[[metric[i]]])) +
geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
geom_density(alpha=.2, fill=colors[i]) +
xlab(label = metric[i]) +
ylab(label = "density")
}
grid.arrange(histo_IBD[[1]], histo_IBD[[2]],histo_IBD[[3]], histo_IBD[[4]],histo_IBD[[5]], histo_IBD[[6]],histo_IBD[[7]], histo_IBD[[8]],histo_IBD[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index 2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))
histo_healthy <- vector('list', length(metric))
for (i in 1:length(metric)){
histo_healthy[[i]] <- all_healthy %>% ggplot(aes(x = .data[[metric[i]]])) +
geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
geom_density(alpha=.2, fill=colors[i]) +
xlab(label = metric[i]) +
ylab(label = "density")
}
grid.arrange(histo_healthy[[1]], histo_healthy[[2]], histo_healthy[[3]], histo_healthy[[4]], histo_healthy[[5]], histo_healthy[[6]], histo_healthy[[7]], histo_healthy[[8]], histo_healthy[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index 2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in healthy dataset", gp=gpar(fontsize=10,font=2)))
Density plots and Box plots
density <- vector('list', length(metric))
box <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
density[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
geom_density()+
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])
box[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
geom_boxplot() +
labs(x = metric[i])
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
density [[i]] <- density [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
box [[i]] <- box [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
# Show plots
for (j in 1:length(metric)){
grid.arrange(density[[j]], box[[j]], ncol=2, top = paste("Density and box plot comparing healthy vs diseases data for metric:", metric[j], sep=" "))
}
violin <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
plot_data <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]]))
violin[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])+
ylab("")+
#ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin [[i]] <- violin [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
# Show plots
#violin
grid.arrange(violin[[1]], violin[[2]],violin[[3]], violin[[4]],violin[[5]], violin[[6]],violin[[7]], violin[[8]],violin[[9]], violin[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index 2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))
Non-parametric test (does not assume normal distribution)
test_helathy_IBD <- list()
for (i in 1:length(metric)){
test_helathy_IBD[[i]] <- pairwise.wilcox.test(healthy_disease[[metric[i]]], healthy_disease$condition, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test_helathy_IBD[[i]]$p.value <- round(test_helathy_IBD[[i]]$p.value, digits = 17)
}
tests <- do.call(what = rbind, args = test_helathy_IBD)
table1 <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
table2 <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
arrange(p.value) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
Orered by name:
table1
Results of the Wilcox test for distributions of different conditions | ||||
parameter | group1 | group2 | p.value | p.adjusted |
shannon_entropy | CD | healthy | 0.62590500578120378 | 0.782381257226504756 |
shannon_entropy | UC | healthy | 0.03666221756548456 | 0.064698030997913922 |
shannon_entropy | UC | CD | 0.31721308809088172 | 0.472593462718414126 |
chao1 | CD | healthy | 0.83156926291337108 | 0.950502117540572455 |
chao1 | UC | healthy | 0.00000257299774203 | 0.000011027133180129 |
chao1 | UC | CD | 0.00122770386886685 | 0.003683111606600550 |
menhinick | CD | healthy | 0.00009695761447705 | 0.000323192048256833 |
menhinick | UC | healthy | 0.00000000000004517 | 0.000000000000338775 |
menhinick | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
margalef | CD | healthy | 0.97735342279918436 | 0.983532178450231886 |
margalef | UC | healthy | 0.00001128001083759 | 0.000042300040640962 |
margalef | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
fisher_alpha | CD | healthy | 0.29813159511134640 | 0.470734097544231178 |
fisher_alpha | UC | healthy | 0.00000012330086912 | 0.000000616504345600 |
fisher_alpha | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
simpson | CD | healthy | 0.88713530970453436 | 0.950502117540572455 |
simpson | UC | healthy | 0.47994170723394008 | 0.626010922479052323 |
simpson | UC | CD | 0.45263274746491944 | 0.617226473815799204 |
gini_index | CD | healthy | 0.00000000000000006 | 0.000000000000000900 |
gini_index | UC | healthy | 0.00000000000000000 | 0.000000000000000000 |
gini_index | UC | CD | 0.09214651379693838 | 0.153577522994897298 |
strong | CD | healthy | 0.00277485741816494 | 0.006275773363135672 |
strong | UC | healthy | 0.00437329418703360 | 0.008746588374067200 |
strong | UC | CD | 0.33081542390288987 | 0.472593462718414126 |
pielou_evenness | CD | healthy | 0.80535452221914239 | 0.950502117540572455 |
pielou_evenness | UC | healthy | 0.87131032043343293 | 0.950502117540572455 |
pielou_evenness | UC | CD | 0.98353217845023189 | 0.983532178450231886 |
faith_pd | CD | healthy | 0.00000000000000038 | 0.000000000000003800 |
faith_pd | UC | healthy | 0.00955325861339085 | 0.017912359900107852 |
faith_pd | UC | CD | 0.00000000072599039 | 0.000000004355942340 |
Ordered by signifficance:
table2
Results of the Wilcox test for distributions of different conditions | ||||
parameter | group1 | group2 | p.value | p.adjusted |
gini_index | UC | healthy | 0.00000000000000000 | 0.000000000000000000 |
gini_index | CD | healthy | 0.00000000000000006 | 0.000000000000000900 |
faith_pd | CD | healthy | 0.00000000000000038 | 0.000000000000003800 |
menhinick | UC | healthy | 0.00000000000004517 | 0.000000000000338775 |
faith_pd | UC | CD | 0.00000000072599039 | 0.000000004355942340 |
fisher_alpha | UC | healthy | 0.00000012330086912 | 0.000000616504345600 |
chao1 | UC | healthy | 0.00000257299774203 | 0.000011027133180129 |
margalef | UC | healthy | 0.00001128001083759 | 0.000042300040640962 |
menhinick | CD | healthy | 0.00009695761447705 | 0.000323192048256833 |
chao1 | UC | CD | 0.00122770386886685 | 0.003683111606600550 |
strong | CD | healthy | 0.00277485741816494 | 0.006275773363135672 |
menhinick | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
margalef | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
fisher_alpha | UC | CD | 0.00292869423612998 | 0.006275773363135672 |
strong | UC | healthy | 0.00437329418703360 | 0.008746588374067200 |
faith_pd | UC | healthy | 0.00955325861339085 | 0.017912359900107852 |
shannon_entropy | UC | healthy | 0.03666221756548456 | 0.064698030997913922 |
gini_index | UC | CD | 0.09214651379693838 | 0.153577522994897298 |
fisher_alpha | CD | healthy | 0.29813159511134640 | 0.470734097544231178 |
shannon_entropy | UC | CD | 0.31721308809088172 | 0.472593462718414126 |
strong | UC | CD | 0.33081542390288987 | 0.472593462718414126 |
simpson | UC | CD | 0.45263274746491944 | 0.617226473815799204 |
simpson | UC | healthy | 0.47994170723394008 | 0.626010922479052323 |
shannon_entropy | CD | healthy | 0.62590500578120378 | 0.782381257226504756 |
pielou_evenness | CD | healthy | 0.80535452221914239 | 0.950502117540572455 |
chao1 | CD | healthy | 0.83156926291337108 | 0.950502117540572455 |
pielou_evenness | UC | healthy | 0.87131032043343293 | 0.950502117540572455 |
simpson | CD | healthy | 0.88713530970453436 | 0.950502117540572455 |
margalef | CD | healthy | 0.97735342279918436 | 0.983532178450231886 |
pielou_evenness | UC | CD | 0.98353217845023189 | 0.983532178450231886 |
An FDR-adjusted p-value (also called q-value) of 0.05 indicates that 5% of significant tests will result in false positives. In other words, an FDR of 5% means that, among all results called significant, only 5% of these are truly null.
# Do Wilcox test to see which parameters show significant differenc between healthy and unhealthy samples
wilcox_healthy_disease <- healthy_disease %>%
summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value)
wilcox_healthy_disease <- t(wilcox_healthy_disease)
colnames(wilcox_healthy_disease) <- c("p.value")
wilcox_healthy_disease <- data.frame(Alpha_Metric = row.names(wilcox_healthy_disease), wilcox_healthy_disease)
wilcox_healthy_disease$p.value <- round(wilcox_healthy_disease$p.value, digits = 17)
wilcox_healthy_disease %>%
flextable() %>%
bold(~ p.value < 0.05, 2) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")
Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples | |
Alpha_Metric | p.value |
Shannon | 0.0545836309232159 |
Chao1 | 0.0003991907280241 |
Menhinick | 0.0000000000000002 |
Margalef | 0.0005759587582896 |
Pielou | 0.7839507078132606 |
Fisher | 0.0000020460847190 |
Gini | 0.0000000000000000 |
Strong | 0.0000594917447862 |
Faith | 0.0042928934443810 |
test of the differentiation across multiple categories, analogous to one-way ANOVA
kruskal_results <- healthy_disease %>%
summarise(Shannon = kruskal.test(healthy_disease$shannon_entropy ~ healthy_disease$condition)$p.value,
Chao1 = kruskal.test(healthy_disease$chao1 ~ healthy_disease$condition)$p.value,
Fisher = kruskal.test(healthy_disease$fisher_alpha ~ healthy_disease$condition)$p.value,
Margalef =kruskal.test(healthy_disease$margalef ~ healthy_disease$condition)$p.value,
Simpson = kruskal.test(healthy_disease$simpson ~ healthy_disease$condition)$p.value,
Menhinick = kruskal.test(healthy_disease$menhinick ~ healthy_disease$condition)$p.value,
Pielou = kruskal.test(healthy_disease$pielou_evenness ~ healthy_disease$condition)$p.value,
Gini = kruskal.test(healthy_disease$gini_index ~ healthy_disease$condition)$p.value,
Strong = kruskal.test(healthy_disease$strong ~ healthy_disease$condition)$p.value,
Faith = kruskal.test(healthy_disease$faith_pd ~ healthy_disease$condition)$p.value)
kruskal_results <- as.data.frame(t(kruskal_results))
colnames(kruskal_results) <- c("p.value")
kruskal_results <- data.frame(Alpha_Metric = row.names(kruskal_results), kruskal_results)
kruskal_results %>%
flextable() %>%
bold(~ p.value < 0.05, 2) %>%
add_header_lines(values = "Results of the Kruskal-Wallis test for differentiation of different parameters across different conditions")
Results of the Kruskal-Wallis test for differentiation of different parameters across different conditions | |
Alpha_Metric | p.value |
Shannon | 0.102673092280582051172288515772379469126462936 |
Chao1 | 0.000014418427585049017265500688467216860999542 |
Fisher | 0.000000533593785303428027878532360905872167223 |
Margalef | 0.000061097615301116482743395974530642433819594 |
Simpson | 0.761771254970544320350711586797842755913734436 |
Menhinick | 0.000000000000000497754805441245181173768753198 |
Pielou | 0.959034190753235926685249523870879784226417542 |
Gini | 0.000000000000000000000000000000000000001888436 |
Strong | 0.000257451010580216457009766761743208007828798 |
Faith | 0.000000000000000260001372008567299599876340898 |
for(x in 2:11){
#for(x in 2:10){
heatmap_h <- densityHeatmap(as.matrix(all_healthy[,x]), ylab = colnames(all_healthy)[x], title ="healthy")
heatmap_d <- densityHeatmap(as.matrix(all_IBD[,x]), ylab = colnames(all_IBD)[x], title = "disease")
heatmap_hd <- densityHeatmap(as.matrix(healthy_disease[,x]), ylab = colnames(healthy_disease)[x], title = "healthy + disease")
gl <- lapply(list(heatmap_h, heatmap_d, heatmap_hd), as.grob)
grid.arrange(grobs=gl, ncol=3, clip=TRUE)
}
table(CD_2$description, CD_2$condition)
##
## control crohns
## father family 01-00010 27 0
## father family 01-00011 20 0
## father family 01-00012 54 0
## father family 01-00013 27 0
## fecal sample index pt 20 0 20
## fecal sample index pt 21 0 25
## fecal sample index pt 23 0 21
## fecal sample index pt 24 0 26
## fecal sample index pt 25 0 14
## fecal sample index pt 26 0 16
## fecal sample index pt 27 0 14
## fecal sample index pt 31 0 10
## fecal sample index pt 32 0 22
## fecal sample index pt 33 0 20
## fecal sample index pt 34 0 2
## fecal sample index pt 35 0 14
## fecal sample index pt 36 0 9
## fecal sample index pt 37 0 22
## fecal sample index pt 39 0 26
## index family 01-00012 0 48
## index family 01-00013 0 26
## mother family 01-00010 25 0
## mother family 01-00011 23 0
## mother family 01-00012 57 0
## mother family 01-00013 27 0
## sibling family 01-00010 18 0
## sibling family 01-00012 55 0
## sibling family 01-00013 20 0
table(CD_2$condition)
##
## control crohns
## 353 335
table(CD_2$surgery_and_ibd)
##
## control crohns crohns (surgery)
## 353 173 162
violin_CD_2a <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- CD_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
violin_CD_2a [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])+
ylab("") +
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_CD_2a [[i]] <- violin_CD_2a [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_CD_2a
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
grid.arrange(violin_CD_2a[[1]], violin_CD_2a[[2]],violin_CD_2a[[3]], violin_CD_2a[[4]],violin_CD_2a[[5]], violin_CD_2a[[6]],violin_CD_2a[[7]], violin_CD_2a[[8]],violin_CD_2a[[9]],violin_CD_2a[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index 2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))
violin_CD_2b <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- CD_2 %>% dplyr::group_by(surgery_and_ibd) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
violin_CD_2b [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = surgery_and_ibd, color = surgery_and_ibd, fill = surgery_and_ibd)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_and_ibd), linetype = "dashed")+
labs(x = metric[i])+
ylab("") +
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_CD_2b [[i]] <- violin_CD_2b [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_CD_2b
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
violin_CD_2_surg <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
plot_data <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::mutate(m = mean(.data[[metric[i]]]))
violin_CD_2_surg [[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(surgery_type, -m), color = surgery_type, fill = surgery_type)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_type), linetype = "dashed")+
labs(x = metric[i])+
ylab("") +
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_CD_2_surg [[i]] <- violin_CD_2_surg [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_CD_2_surg
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
Now let’s check the difference between this study and previous one measuring diversity in Crohn’s disease:
CD_1 <- healthy_disease[healthy_disease$condition != "UC",]
CD_2_surg <- CD_2
CD_2_surg$condition <- NULL
names(CD_2_surg)[names(CD_2_surg) == 'surgery_and_ibd'] <- 'condition'
#CD_check <- rbind.fill(CD_1, CD_2)
CD_check <- rbind.fill(CD_1, CD_2_surg)
CD_check$condition[CD_check$condition == "CD"] <-'CD_1'
CD_check$condition[CD_check$condition == "crohns"] <-'CD_2'
CD_check$condition[CD_check$condition == "crohns (surgery)"] <-'CD_2_surgery'
CD_check$condition[CD_check$condition == "healthy"] <-'control(AGP)'
CD_check$condition[CD_check$condition == "control"] <-'control_2'
violin_CD_check <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- CD_check %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
violin_CD_check [[i]] <- CD_check %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
scale_y_discrete(limits=c("CD_2", "control_2", "CD_2_surgery", "CD_1", "control(AGP)")) +
labs(x = metric[i])+
ylab("") +
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_CD_check [[i]] <- violin_CD_check [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_CD_check
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
test_CD_1 <- list()
test_CD_2 <- list()
for (i in 1:length(metric)){
test_CD_1[[i]] <- pairwise.wilcox.test(CD_1[[metric[i]]], CD_1$condition, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test_CD_1[[i]]$p.value <- round(test_CD_1[[i]]$p.value, digits = 17)
test_CD_2[[i]] <- pairwise.wilcox.test(CD_2[[metric[i]]], CD_2$surgery_and_ibd, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test_CD_2[[i]]$p.value <- round(test_CD_2[[i]]$p.value, digits = 17)
}
tests1 <- do.call(what = rbind, args = test_CD_1)
tests2 <- do.call(what = rbind, args = test_CD_2)
table_CD_1 <- tests1 %>%
add_column(p.adjusted = p.adjust(tests1$p.value, "fdr"), .after='p.value') %>%
arrange(p.value, parameter) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different conditions in CD_1 dataset")
table_CD_2 <- tests2 %>%
add_column(p.adjusted = p.adjust(tests2$p.value, "fdr"), .after='p.value') %>%
arrange(p.value, parameter) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcoxon test for distributions of different conditions in CD_2 dataset")
Results of the Wilcoxon test for diversity distributions of healthy and CD samples in first study:
table_CD_1
Results of the Wilcox test for distributions of different conditions in CD_1 dataset | ||||
parameter | group1 | group2 | p.value | p.adjusted |
gini_index | CD | healthy | 0.00000000000000006 | 0.0000000000000006 |
faith_pd | CD | healthy | 0.00000000000000038 | 0.0000000000000019 |
menhinick | CD | healthy | 0.00009695761447705 | 0.0003231920482568 |
strong | CD | healthy | 0.00277485741816494 | 0.0069371435454124 |
fisher_alpha | CD | healthy | 0.29813159511134640 | 0.5962631902226928 |
shannon_entropy | CD | healthy | 0.62590500578120378 | 0.9773534227991844 |
pielou_evenness | CD | healthy | 0.80535452221914239 | 0.9773534227991844 |
chao1 | CD | healthy | 0.83156926291337108 | 0.9773534227991844 |
simpson | CD | healthy | 0.88713530970453436 | 0.9773534227991844 |
margalef | CD | healthy | 0.97735342279918436 | 0.9773534227991844 |
Results of the Wilcoxon test for diversity distributions of healthy and CD samples in second (longitudinal) study:
table_CD_2
Results of the Wilcoxon test for distributions of different conditions in CD_2 dataset | ||||
parameter | group1 | group2 | p.value | p.adjusted |
faith_pd | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
fisher_alpha | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
gini_index | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
margalef | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
menhinick | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
pielou_evenness | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
shannon_entropy | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
simpson | crohns (surgery) | control | 0.00000000000000000 | 0.00000000000000000000000 |
chao1 | crohns (surgery) | control | 0.00000000000000001 | 0.00000000000000003333333 |
strong | crohns (surgery) | control | 0.00000000000070725 | 0.00000000000212175000000 |
faith_pd | crohns (surgery) | crohns | 0.00000000000412302 | 0.00000000001124460000000 |
gini_index | crohns (surgery) | crohns | 0.00000000019662471 | 0.00000000049156177500000 |
fisher_alpha | crohns (surgery) | crohns | 0.00000000051815197 | 0.00000000103630394000000 |
margalef | crohns (surgery) | crohns | 0.00000000051815197 | 0.00000000103630394000000 |
menhinick | crohns (surgery) | crohns | 0.00000000051815197 | 0.00000000103630394000000 |
shannon_entropy | crohns (surgery) | crohns | 0.00000000272429143 | 0.00000000510804643125000 |
chao1 | crohns (surgery) | crohns | 0.00000001017126923 | 0.00000001794929864117647 |
pielou_evenness | crohns (surgery) | crohns | 0.00000536780985689 | 0.00000894634976148333456 |
simpson | crohns (surgery) | crohns | 0.00000600629169141 | 0.00000948361846012105263 |
simpson | crohns | control | 0.00002013513551602 | 0.00003020270327402999801 |
pielou_evenness | crohns | control | 0.00003537721365162 | 0.00005053887664517142908 |
strong | crohns | control | 0.00026284991838079 | 0.00035843170688289540966 |
strong | crohns (surgery) | crohns | 0.00037910329096575 | 0.00049448255343358701701 |
shannon_entropy | crohns | control | 0.00077522271035731 | 0.00096902838794663752157 |
gini_index | crohns | control | 0.01750412729110927 | 0.02100495274933112180293 |
chao1 | crohns | control | 0.06579526636831212 | 0.07591761504036013963326 |
faith_pd | crohns | control | 0.10341083464326625 | 0.11490092738140694761384 |
fisher_alpha | crohns | control | 0.21387405361949660 | 0.21387405361949660131948 |
margalef | crohns | control | 0.21387405361949660 | 0.21387405361949660131948 |
menhinick | crohns | control | 0.21387405361949660 | 0.21387405361949660131948 |
CD_check_w <- CD_check %>% filter(CD_check$condition != "control(AGP)" & CD_check$condition != "control_2" )
test_CD_3 <- list()
for (i in 1:length(metric)){
test_CD_3[[i]] <- pairwise.wilcox.test(CD_check_w[[metric[i]]], CD_check_w$condition, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test_CD_3[[i]]$p.value <- round(test_CD_3[[i]]$p.value, digits = 17)
}
tests <- do.call(what = rbind, args = test_CD_3)
table_CD_3 <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
arrange(p.value, parameter) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different groups of Crhohn's disease patients")
table_CD_3
Results of the Wilcox test for distributions of different groups of Crhohn's disease patients | ||||
parameter | group1 | group2 | p.value | p.adjusted |
faith_pd | CD_2_surgery | CD_1 | 0.00000000000000318 | 0.000000000000073700 |
faith_pd | CD_2 | CD_1 | 0.00000000000000700 | 0.000000000000073700 |
gini_index | CD_2_surgery | CD_1 | 0.00000000000000737 | 0.000000000000073700 |
gini_index | CD_2 | CD_1 | 0.00000000000002937 | 0.000000000000220275 |
faith_pd | CD_2_surgery | CD_2 | 0.00000000000412302 | 0.000000000024738120 |
gini_index | CD_2_surgery | CD_2 | 0.00000000019662471 | 0.000000000983123550 |
fisher_alpha | CD_2_surgery | CD_2 | 0.00000000051815197 | 0.000000001727173233 |
margalef | CD_2_surgery | CD_2 | 0.00000000051815197 | 0.000000001727173233 |
menhinick | CD_2_surgery | CD_2 | 0.00000000051815197 | 0.000000001727173233 |
shannon_entropy | CD_2_surgery | CD_2 | 0.00000000272429143 | 0.000000008172874290 |
chao1 | CD_2_surgery | CD_2 | 0.00000001017126923 | 0.000000027739825173 |
pielou_evenness | CD_2_surgery | CD_2 | 0.00000536780985689 | 0.000013419524642225 |
simpson | CD_2_surgery | CD_2 | 0.00000600629169141 | 0.000013860673134023 |
shannon_entropy | CD_2_surgery | CD_1 | 0.00001835318609066 | 0.000039328255908557 |
simpson | CD_2_surgery | CD_1 | 0.00004727241478514 | 0.000094544829570280 |
margalef | CD_2_surgery | CD_1 | 0.00010355431737318 | 0.000194164345074712 |
fisher_alpha | CD_2_surgery | CD_1 | 0.00013726797722562 | 0.000242237606868741 |
chao1 | CD_2_surgery | CD_1 | 0.00024198165016724 | 0.000403302750278733 |
strong | CD_2_surgery | CD_2 | 0.00037910329096575 | 0.000598584143630132 |
menhinick | CD_2_surgery | CD_1 | 0.00060108041699576 | 0.000901620625493640 |
pielou_evenness | CD_2_surgery | CD_1 | 0.00090946322043473 | 0.001299233172049614 |
strong | CD_2_surgery | CD_1 | 0.06721818839753790 | 0.091661165996642591 |
simpson | CD_2 | CD_1 | 0.08731955396999834 | 0.113895070395650014 |
shannon_entropy | CD_2 | CD_1 | 0.28215387991398205 | 0.352692349892477552 |
pielou_evenness | CD_2 | CD_1 | 0.31590796277536898 | 0.379089555330442751 |
margalef | CD_2 | CD_1 | 0.47848035469736533 | 0.538275341934925544 |
chao1 | CD_2 | CD_1 | 0.48444780774143292 | 0.538275341934925544 |
fisher_alpha | CD_2 | CD_1 | 0.53972418968600622 | 0.578275917520720939 |
menhinick | CD_2 | CD_1 | 0.84999324337675142 | 0.879303355217329163 |
strong | CD_2 | CD_1 | 0.98933097188722530 | 0.989330971887225297 |
From the paper: “A recent study suggests significantly lower response of CDI to FMT in patients with underlying inflammatory bowel disease (IBD). We have also previously described a higher rate of recurrence of CDI following FMT in patients with CDI and underlying IBD”
colnames(trans_IBD_CDI)
## [1] "sample_id" "shannon_entropy"
## [3] "chao1" "fisher_alpha"
## [5] "margalef" "gini_index"
## [7] "menhinick" "strong"
## [9] "simpson" "faith_pd"
## [11] "pielou_evenness" "animations_subject"
## [13] "sex" "age"
## [15] "body_mass_index" "condition"
## [17] "day_since_fmt" "donor_or_patient"
## [19] "number_recurrence_after_fmt" "qiita_study_id"
table(trans_IBD_CDI$condition)
##
## CDI CDI + CD CDI + UC Not applicable
## 39 16 12 28
table(trans_IBD_CDI$day_since_fmt)
##
## -1 28 7 NA-Donor no_data
## 23 18 22 30 2
trans_IBD_CDI_1 <- trans_IBD_CDI %>%
filter(day_since_fmt != "no_data" ) %>%
filter(!(donor_or_patient == "Donor" & condition=="CDI"))
violin_trans <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- trans_IBD_CDI_1 %>% dplyr::group_by(condition, day_since_fmt) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
violin_trans[[i]] <- trans_IBD_CDI_1 %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])+
ylab("") +
facet_wrap(vars(day_since_fmt))+
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_trans [[i]] <- violin_trans [[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_trans
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
cond <- c("CDI + UC", "CDI", "CDI + CD")
test_CDI_trans <- list()
table <- list()
for (i in 1:length(cond)){
trans_IBD_CDI_2 <- trans_IBD_CDI_1 %>%
filter(condition == cond[i])
for (j in 1:length(metric)){
test_CDI_trans[[j]] <- pairwise.wilcox.test(trans_IBD_CDI_2[[metric[j]]], trans_IBD_CDI_2$day_since_fmt, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[j], .before='group1')
test_CDI_trans[[j]]$p.value <- round(test_CDI_trans[[j]]$p.value, digits = 17)
}
tests <- do.call(what = rbind, args = test_CDI_trans)
table <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = paste("Results of the Wilcox test for condition:", cond[i], sep = " "))
print(table)
test_CDI_trans <- list()
}
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted`
## header has 2 row(s)
## body has 30 row(s)
## original dataset sample:
## parameter group1 group2 p.value p.adjusted
## 1 shannon_entropy 28 -1 0.20000000 0.40000000
## 2 shannon_entropy 7 -1 0.02857143 0.08571429
## 3 shannon_entropy 7 28 0.68571429 0.97959184
## 4 chao1 28 -1 0.02857143 0.08571429
## 5 chao1 7 -1 0.05714286 0.13186813
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted`
## header has 2 row(s)
## body has 30 row(s)
## original dataset sample:
## parameter group1 group2 p.value p.adjusted
## 1 shannon_entropy 28 -1 2.048483e-03 4.727269e-03
## 2 shannon_entropy 7 -1 2.502252e-03 5.361969e-03
## 3 shannon_entropy 7 28 8.078046e-01 8.975606e-01
## 4 chao1 28 -1 1.713188e-05 8.565939e-05
## 5 chao1 7 -1 2.070886e-07 1.553165e-06
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted`
## header has 2 row(s)
## body has 30 row(s)
## original dataset sample:
## parameter group1 group2 p.value p.adjusted
## 1 shannon_entropy 28 -1 0.031746032 0.0952381
## 2 shannon_entropy 7 -1 0.030303030 0.0952381
## 3 shannon_entropy 7 28 1.000000000 1.0000000
## 4 chao1 28 -1 0.059327061 0.1618011
## 5 chao1 7 -1 0.007969413 0.0952381
table(C_diff_trans$disease_state)
##
## post-FMT Pre-FMT
## 91 4
table(C_diff_trans$day_since_fmt)
##
## -18 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1 1 1 1 2 3 2 2 2 3 2 3 3 2 3 3 2 3 3 2
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 35 36 40 42
## 3 3 2 3 2 3 2 2 1 3 2 1 1 1 1 1 1 1 1 2
## 47 49 55 56 63 70 77 84 151
## 1 2 1 2 2 2 2 2 1
table(C_diff_trans$animations_subject)
##
## CD1 CD2 CD4
## 36 23 36
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD1"] <-'subject_1'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD2"] <-'subject_2'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD4"] <-'subject_3'
progression <- vector('list', length(metric))
for (i in 1:length(metric)){
progression[[i]] <- C_diff_trans %>% ggplot(aes(x=day_since_fmt, y= .data[[metric[i]]], group=animations_subject)) +
geom_line(aes(color=animations_subject))+
geom_point(aes(color=animations_subject))+
facet_wrap(vars(animations_subject), scale="free", ncol=2)
}
progression
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
before_trans <- trans_IBD_CDI %>%
filter(day_since_fmt != c("7","28", "no_data", "NA-Donor"),
condition != "Not applicable")
#healthy_disease_2 <- rbind.fill(healthy_disease, before_trans, CD_2)
healthy_disease_2 <- rbind.fill(healthy_disease, before_trans)
# Sizes of each dataset
table(healthy_disease_2$condition)
##
## CD CDI CDI + CD CDI + UC healthy UC
## 24 30 11 9 847 40
nrow(healthy_disease)
## [1] 911
ggplot(healthy_disease_2, aes(x = faith_pd)) +
geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30) +
geom_density (alpha=.2, fill="#009E73") +
xlab(label = "faith_pd") +
ylab(label = "density") +
scale_x_continuous(trans = 'log10') +
facet_wrap(vars(condition))
*The values on y scale are densities instead of frequencies (what percentage of observations in a dataset fall between different values)
violin_all <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
plot_data <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]]))
violin_all[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])+
ylab("")+
ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_all[[i]] <- violin_all[[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_all
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
Non-parametric test (does not assume normal distribution)
test <- list()
for (i in 1:length(metric)){
test[[i]] <- pairwise.wilcox.test(healthy_disease_2[[metric[i]]], healthy_disease_2$condition, p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test[[i]]$p.value <- round(test[[i]]$p.value, digits = 17)
}
tests <- do.call(what = rbind, args = test)
table1 <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
arrange(p.value, parameter) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
table2 <- tests %>%
add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
arrange(parameter, group1) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
#table1
#table2
# Do Wilcox test to see which parameters show significant differenc between healthy and unhealthy samples
wilcox_p_value <- healthy_disease_2 %>%
summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
Simpson = wilcox.test(simpson[condition == "healthy"], simpson[condition != "healthy"])$p.value,
Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value)
wilcox_p_value <- t(wilcox_p_value)
colnames(wilcox_p_value) <- c("p.value")
wilcox_p_value <- data.frame(Alpha_Metric = row.names(wilcox_p_value), wilcox_p_value)
wilcox_p_value$p.value <- round(wilcox_p_value$p.value, digits = 17)
wilcox_p_value %>%
flextable() %>%
bold(~ p.value < 0.05, 2) %>%
add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")
Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples | |
Alpha_Metric | p.value |
Shannon | 0.00000001320300907 |
Chao1 | 0.00000000000000940 |
Menhinick | 0.00000000000000000 |
Margalef | 0.00000000000000272 |
Simpson | 0.00012397694654720 |
Fisher | 0.00000000000000000 |
Pielou | 0.00565682197111992 |
Gini | 0.00000000000000000 |
Strong | 0.00000002116645113 |
Faith | 0.00109525129338404 |
If we add CD samples from longitudinal study:
all_data <- rbind.fill(healthy_disease, before_trans, CD_2)
table(all_data$condition)
##
## CD CDI CDI + CD CDI + UC control crohns healthy UC
## 24 30 11 9 353 335 847 40
all_data$condition[all_data$condition == "crohns"] <-'CD'
all_data$condition[all_data$condition == "control"] <-'healthy'
violin_all_b <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- all_data %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
plot_data <- all_data %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]]))
violin_all_b[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+
labs(x = metric[i])+
ylab("")+
ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin_all_b[[i]] <- violin_all_b[[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
#plots for Shannon entropy
violin_all_b
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggplotify_0.1.0 RColorBrewer_1.1-3 ComplexHeatmap_2.10.0
## [4] tibble_3.1.8 flextable_0.8.2 dplyr_1.0.10
## [7] purrr_0.3.5 plyr_1.8.7 gridExtra_2.3
## [10] ggplot2_3.4.0 stringr_1.4.1 data.table_1.14.4
## [13] readr_2.1.3
##
## loaded via a namespace (and not attached):
## [1] tidyr_1.2.1 sass_0.4.2 jsonlite_1.8.3
## [4] foreach_1.5.2 bslib_0.4.1 assertthat_0.2.1
## [7] highr_0.9 stats4_4.1.2 yulab.utils_0.0.5
## [10] yaml_2.3.6 gdtools_0.2.4 backports_1.4.1
## [13] pillar_1.8.1 glue_1.6.2 uuid_1.1-0
## [16] digest_0.6.30 colorspace_2.0-3 htmltools_0.5.3
## [19] pkgconfig_2.0.3 GetoptLong_1.0.5 broom_1.0.1
## [22] scales_1.2.1 officer_0.4.4 tzdb_0.3.0
## [25] generics_0.1.3 farver_2.1.1 IRanges_2.28.0
## [28] ellipsis_0.3.2 cachem_1.0.6 withr_2.5.0
## [31] BiocGenerics_0.40.0 cli_3.4.1 magrittr_2.0.3
## [34] crayon_1.5.2 evaluate_0.17 fansi_1.0.3
## [37] doParallel_1.0.17 xml2_1.3.3 tools_4.1.2
## [40] hms_1.1.2 GlobalOptions_0.1.2 lifecycle_1.0.3
## [43] matrixStats_0.61.0 S4Vectors_0.32.4 munsell_0.5.0
## [46] cluster_2.1.2 zip_2.2.0 compiler_4.1.2
## [49] jquerylib_0.1.4 systemfonts_1.0.4 gridGraphics_0.5-1
## [52] rlang_1.0.6 iterators_1.0.14 rstudioapi_0.14
## [55] rjson_0.2.21 circlize_0.4.15 base64enc_0.1-3
## [58] labeling_0.4.2 rmarkdown_2.17 gtable_0.3.1
## [61] codetools_0.2-18 DBI_1.1.3 R6_2.5.1
## [64] knitr_1.40 fastmap_1.1.0 utf8_1.2.2
## [67] clue_0.3-62 shape_1.4.6 stringi_1.7.8
## [70] parallel_4.1.2 Rcpp_1.0.8 vctrs_0.5.0
## [73] png_0.1-7 tidyselect_1.2.0 xfun_0.37